##  MID.CD.newcases.2b02.pl
##
##  This Perl program uses the *.nlp.vocab.txt file output from MID.NLP.filter.1b* and a  
##  set of news stories downloaded from Factiva, and generates records that can be used
##  as input to the svm_light classifier (http://svmlight.joachims.org/). 
##
##  RUNNING THE PROGRAM
##
##  Put the program in the folder with all of the files, then
##
##     perl MID.NLP.newcases.2b02.pl 
##
##  The program provides ample feedback to the screen as it is running, so if it hangs up
##  somewhere (hasn't happened yet...) you'll know.  Run time is on the order of 40,000 
##  stories per hour.
##
##  INPUT FILES
##
##  class.filter.files
##  List of the Factiva files that are to be processed.  
##
##  Example:
##  
##     News and Information from Factiva(57).txt
##     News and Information from Factiva(56).txt
##     News and Information from Factiva(55).txt
##     News and Information from Factiva(54).txt
##
##  This is easily generated by using "ls > class.filter.files" and then editing out files that
##  do not contain reports.
##
##  *.vocab.txt
##  Ordered list of words (common English words) that were used in the classifier; this is
##  output from MID.NLP.filter.1b*. This is hardcoded in the variable $word_file.
##
##  CountryCodes.MIDNLP.txt
##  This is a modification of the xml file that is used as the input to Actor_Filter that 
##  contains country and nationality names and is used to detect dyads. This is hardcoded
##  in the variable $nat_file
##
##  Factiva reports
##  These are emailed from the Factiva system, and consist of the stories separated by
##  -----------------. The first two lines are used as an ID string.
##
##  OUTPUT FILES 
##
##  $prefix.new.output.txt: input to the svm_light system. An identification block with the
##                       tab-delimited fields Factiva ID, headline, and codes for the two 
##                       most frequently mentioned nations is appended to this 
##
##  $prefix.domestic.cases.txt: list of cases where fewer than two distinct actors were found
##                      before the $DYAD_LINES cutoff (currently 12 lines of text).
##
##  $prefix.reject.cases.txt: lists of Factiva header and weather report cases
##                    <08.12.06>: Vladimir's new filters that are upstream from this make
##                    this unnecessary; probably should remove this
##
##  $prefix.natns.cases.txt: list of the dyads found
##
##
##  PROGRAMMING NOTES: 
## 
##  1. The 'kz' and 'kb' counters in the main text processing loop can be used to just process the 
##     first few records in each file when testing the program; it is currently set just to do this
##     for the more frequent true reports.
##  
##  2. <08.12.06> Some of the texts are ALL UPPER CASE: the current word list is not getting
##     these though it would be easy enough to add that if it is needed. Also added the lower-
##     case, upper case, and possessive to the country names.
##
##  3. This is a working program that was modified incrementally over a couple of years and 
##     different input formats, so if something appears to contradict the documentation, it
##     probably does.
##
##  4. Files in class.filter.files that do not have names generated by dups.pl, defined as
##
##     	if (($file !~ m/0$/) || ($file !~ m/\.\d/)) { next;}
##
##     are skipped; this is intended to eliminate the need to edit the ls > class.filter.files
##
##  5. At the end of the program, class.filter.files is replaced by a file that contains only
##     the file names satisfying the above, thus giving later programs a clean file list to 
##     work with.
##
##  SYSTEM REQUIREMENTS
##
##  This program has been successfully run under Mac OS 10.4/5; it is standard perl
##  so it should also run in Unix or Windows. 
##
##  PROVENANCE:
##  Programmer: Philip A. Schrodt
##              Dept of Political Science
##              Pennsylvania State University
##              227 Pond Laboratory
##	            University Park, PA, 16802 U.S.A.
##	            http://eventdata.psu.edu
##
##  Programming supported by National Science Foundation Political Science Program Grant
##  SES-0719634 "Improving the Efficiency of Militarized Interstate Dispute Data Collection using 
##  Automated Textual Analysis" and SES-0924240, "MID4: Updating the Militarized Dispute Data Set 
##  2002-2010." 
##
##	Copyright (c) 2010  Philip A. Schrodt.  All rights reserved.
##
## 	Redistribution and use in source and binary forms, with or without modification,
## 	are permitted under the terms of the GNU General Public License:
## 	http://www.opensource.org/licenses/gpl-license.html
##
##	Report bugs to: schrodt@psu.edu
##
##	For plausible indenting of this source code, set the tab size in your editor to "2"
##
##  REVISION HISTORY:
##  20-Jul-08:  Initial version
##  06-Dec-08 :  Assorted revisions to handle "richer" text formats in the new downloads
##  23-Dec-08 :  Still more of the above
##  09-Dec-09 :  Vers .2B -- modified to count all country.codes.txt occurrences; also adds newswire source
##  26-Feb-10 :  Vers .2B02 -- assorted mods to improve the country counts
##  31-Mar-10 :  Vers .2B03 -- minor mods for final formatting suite
##  

# ======== globals =========== #

$file_list   = "class.filter.files";  
$file_edit   = "class.filter.edit";
$word_file   = "LN2001all.vocab.index.txt";
$nat_file    = "CountryCodes.MIDNLP.txt";

$prefix = "MIDSVM";
$file_out    = ".new.output.txt";
$file_domes  = ".domestic.cases.txt";
$file_reject = ".reject.cases.txt";
$file_natns = ".natns.cases.txt";

$DYAD_LINES = 12; # number of lines to check for the mention of two countries

# ======== subroutines =========== #

sub do_store {  # store $natn and $code
	if(length($sstr) > 3) {  # bypasses some junk in CountryCodes.txt
		$natn[$nnat] = $sstr; 
		$code[$nnat] = $ccode;
		++$nnat;
	}
}

sub hashValueDescendingNum {  # used to sort the nat references
   $natref{$b} <=> $natref{$a};
}

# ======== main program =========== #

print "Running\n";

#read classification words file

%classwords = ();  # initialize hash
open(DAT, $word_file) || die("Could not open classification words file $word_file");
$totwords=0;
while ($word = <DAT>) {
	$totwords++;
	$word = substr($word,0,index($word," "));
	$classwords{$word} = $totwords;
	$classwords{uc($word)} = $totwords;
}
close(DAT);
chomp(@classwords);

#$ka=0;
#while ($ka < 20) {
#  print $classwords[$ka],"\n";
#  $ka++;
#}
#exit;

#read nationalities file
$natn[0] = " US ";  # deal with special short cases
$code[0] = "USA";
$natn[1] = "U S ";  
$code[1] = "USA";
$natn[2] = " UK "; 
$code[2] = "GBR";
$natn[3] = "U K "; 
$code[3] = "GBR";
$natn[4] = " UN "; 
$code[4] = "UNO";
$natn[5] = " UN\'S "; 
$code[5] = "UNO";
$natn[6] = "U N "; 
$code[6] = "UNO";
$natn[7] = "UNITED NATIONS "; 
$code[7] = "UNO";
$nnat = 8;
open(DAT, $nat_file) || die("Could not open $nat_file file");
while ($line = <DAT>) {
	if ($line =~ m/CountryCode>/) {
		$line =~ m/>(\w+)</;
		$ccode = $1;
	}
	elsif (($line =~ m/CountryName>/) || ($line =~ m/Capital>/)) {
		$line =~ m/>(.+)</;
		$sstr = $1; 
		$sstr =~ tr/_/ /; #remove the underscores
		$sstr =~ s/\'/\\\'/g;  #escape the ' -- occurs in some island names
		$temp = $sstr;   # save before adding blank
		$sstr .= " ";  # add final blank
		do_store;
		$sstr = $temp."\\\'s "; #form possessive
		do_store;
	}
	elsif ($line =~ m/Nationality>/) {
		$line =~ m/>(\w+)</;
		$sstr = $1; 
		$sstr =~ tr/_/ /;
		$sstr =~ s/\'/\\\'/g;
		$temp = $sstr;   # save before adding blank
		$sstr .= " ";  # add final blank
		do_store;
		if ($temp !~ m/S\Z/) {
			$sstr = $temp."s "; #form plural
			do_store;
		}
	}
	elsif ($line =~ m/MajorCities>/) {
		while ($line = <DAT>) {
			if ($line =~ m/MajorCities>/) {last;}
			$line =~ m/\s+(\w+)/;
			$sstr = $1; 
			$sstr =~ tr/_/ /;
			$sstr .= " ";  # add final blank
			do_store;
		}
	}
#	if ($nnat > 64) {last;}
}
close(DAT);
#for ($ka = 0; $ka < $nnat; ++$ka) {	print "$ka   \"$natn[$ka]\"   $code[$ka]\n";}
#exit;

open(FDIR,$file_list)             or die "Can\'t open list of input files $file_list; error $!";
open(FOUT,">$prefix$file_out")    or die "Can\'t open output file $file_out; error $!";
open(FDOM,">$prefix$file_domes")  or die "Can\'t open output file $file_domes; error $!";
open(FREJ,">$prefix$file_reject") or die "Can\'t open output file $file_reject; error $!";
open(FNAT,">$prefix$file_natns")  or die "Can\'t open output file $file_natns; error $!";
open(FEDT,">$file_edit")          or die "Can\'t open output file $file_edit; error $!";

$kb = 0;   # count of files processed
while ($file = <FDIR>) {	# file loop
#	if (++$kb > 1) {last}; # debug
#	if ($file !~ m/^Reut/) {next;}   # skip files that don't contain data
	chomp($file);
	if (($file !~ m/0$/) || ($file !~ m/\.\d/)) { next;}  # skip files that don't have a name generated by dup.pl 
	print FEDT $file,"\n";  # write to the edited class.filter.files
	print "\nProcessing $file\n";
	print FDOM "\nProcessing $file\n";
	print FREJ "\nProcessing $file\n";
	open(FIN,"$file")       or die "Can\'t open input file $file; error $!";
			 
	### main record processing loop ###

	# skip file header
	while ($line = <FIN>) {
		if ($line =~ m/---------------------------------/) {last;}
	}
		
	$kz = 0;  # debug;
	while(!eof FIN) {
#	  if ($kz++ > 10) {last;}  # debug -- just does a few records per file
	# get ID
		chomp($line = <FIN>);
		chomp($idstring = <FIN>);
		$idstring = $idstring."\t".$line; # Factiva ID + headline

#		print ">> $idstring\n";
		
		if (($idstring =~ m/WEATHER/) 
				|| ($idstring =~ m/Dow Jones/)
				|| ($idstring =~ m/STOCK ALERT/)
				|| ($idstring =~ m/SHOW: /)) { # not news, so skip rest of story
			$isreject = 10;
			$total = 0;
			$hasdyad = 10; # this prevents writing to FDOM
			print FREJ "$idstring\n"; 
			print "Reject: $idstring\n";
			while ($line = <FIN>)  { 
				if ($line =~ m/---------------------------------/) {last;}
			}
		}
		else { $isreject = 0;}

		while ($line = <FIN>) {  # read remainder of header
			if ($line =~ m/\(c\)/) {	# news source
				chomp($line);
				$idstring = $idstring."\t".$line; 
			}
			if (length($line) < 2) {last;}
		}
		
	# process main text
		$hasdyad = 0;   # dyad detection
		$gotnat  = 0;
		$nline   = 0;
		%natref = ();   # initialize hash natref
		
		$total = 0;     # total words found
		for($ka=1; $ka <= $totwords; $ka++) {$countwords[$ka] = 0;}
		
		while ($line = <FIN>)  { 
			if ($line =~ m/---------------------------------/) {
				$nline = $DYAD_LINES;  # signal that we checked everything
				if (0 == $hasdyad) { $total = 0;}   # this by-passes the record writing if no dyad
				last;
			}
			if (length($line) < 64) {next;}  # need to skip blanks, but otherwise Factiva has little junk

			++$nline;
			if (($nline >= $DYAD_LINES) &&  (0 == $hasdyad)) { # no dyad, so skip rest of story
				while ($line = <FIN>)  { 
					if ($line =~ m/---------------------------------/) {last;}
				}
				$total = 0;
				last;
			}
#		  if ($nline < $DYAD_LINES) { print $line;} #debug
#			print $line; # debug
			
			$edline = $line;
			$edline =~ tr/-,\./   /;  # get rid of punctuation
			$edline =~ s/ us / ux /g;  # eliminate lc "us"  
			$ka = 0;
			while ($ka < $nnat) {
				if ($edline =~ m/\b$natn[$ka]/i) { # Two notes on this:
																					 #   1. This will miss multi-word references that cross lines, but should be close enough
																					 #   2. Should this be \b$natn[$ka]\b, and then I wouldn't need the trailing blank? -- that is more general... <10.03.03>
					$edline = $';   # save the rest of the string
#					print $line; # debug
#					print $edline; # debug
#					print "$ka  $code[$ka]  $natn[$ka]\n";   # debug
					if (exists($natref{$code[$ka]})) {++$natref{$code[$ka]};}
					else {      # new dyad
						$natref{$code[$ka]} = 1;
						if ($gotnat) { $hasdyad = 1 }  # second reference, so we're okay
						else {$gotnat = 1;}
					}
					$ka = -1;  # restart the search on the remainder of the string
				}
				++$ka;
			}

			@allwords = split(/ /,$line);
			foreach $word (@allwords) {
				$word =~ s/["',\.\?\n]//g;   # remove punctuation 
				
				if ($word =~ m/\A[A-Z]/) {next;} # remove capitalized words
				if ($word =~ m/[0-9]/) {next;} # remove anything containing numbers
				if ($word =~ m/-[A-Z]/) {next;} # remove "al-" and "el-" proper nouns
				if (length($word) <= 2) {next}  # remove some additional junk, also preempts some stop words
				
				 # get the counts for the classification words
				if (exists($classwords{$word})) { 
					++$countwords[$classwords{$word}];
					++$total;
#					print $word,"  ", $classwords{$word},"  ",$total,"\n";
				}
			
			} # $word


		} # $line
		
		# find the two most frequent code[] in %natref
		$max1val = 0;
		$max1nat = "---";
		$max2val = 0;
		$max2nat = "---";
		while ( ($key, $value) = each(%natref)) {
			if ($value > $max1val) {
				$max2nat = $max1nat;
				$max2val = $max1val;
				$max1nat = $key;
				$max1val = $value;
			}
			elsif ($value > $max2val) {
				$max2nat = $key;
				$max2val = $value;
			}
#			if ($kz < 8) { print "$key   $value  $max1nat   $max2nat\n";} #debug
		}		
		
		# write the svm_light record
		if ($total > 0) {
			print "$idstring : $max1nat-$max2nat\n";
#			print "\n"; # debug
			print FOUT "0 ";
			for($ka=1; $ka <= $totwords; $ka++)	{
				if ($countwords[$ka] > 0) {printf FOUT "%4d:%6.4f ",$ka,($countwords[$ka]/$total);}
			}
			print FOUT "# $idstring\t$max1nat\t$max2nat\n";

			#write the state-count record
			$idstring =~ m/\t/;  # just get the first field
			print FNAT "$`\t$max1nat\t$max2nat\t";
			if ($max1val > 0) { $ratio = $max2val/$max1val;}
			else { $ratio = 0.0; }  # not sure how this ever occurred but it did after 100K cases in LN2001
			printf FNAT "%6.4f", $ratio;
			foreach $key (sort hashValueDescendingNum (keys(%natref))) { print FNAT "\t$key : $natref{$key}";}
			print FNAT "\n"
		}
		elsif (0 == $hasdyad) {	
			print FDOM "# $idstring\n"; 
			print "No dyad: $idstring\n";
#			print "\n"; # debug
		}
		else {	# uc vocabularly has been added, so we shouldn't hit this now
			print FDOM "All uppercase # $idstring\n"; 
			print "All uppercase: $idstring\n";
#			print "\n"; # debug
		}

	}  # while !eof
	close(FIN);
	
} # end file loop

close(FDIR);
close(FOUT);
close(FDOM);
close(FREJ);

# replace class.filter.files
close(FEDT);
unlink $file_list;
rename $file_edit, $file_list;

print "Program has finished!\n";
exit;
